#############################################################################
#Nearest Neighbour Reference Site selection
##############################################################################
#' Match test sites with reference sites
#'
#' Methods for matching test sites with nearest neighbour reference sites based on ecological similarity. The Assessment by Nearest-Neighbour Analysis (ANNA)
#' and Redundancy Analysis ANNA (RDA-ANNA) are implimented as in (\url{http://www.bioone.org/doi/abs/10.1086/678702}). Ecological distance
#' are calculated as Euchlidean distance across significant ordination axis. Axis are standardized and weighted according the proportion
#' of variance explained. Significance of axis is determined by the broken-stick method for ANNA and a permutation test for RDA-ANNA.
#'
#' An option for adaptive selection of k of based on dynamic 1-dimensional k-means clustering implimented in \code{\link[Ckmeans.1d.dp]{Ckmeans.1d.dp}}
#' is available, otherwise k must be supplied.
#'
#' @param Test Vector of environmental variables at Test site
#' @param Reference Data frame of environmental Variables at Reference sites
#' @param k Numeric, the numer of nearest neighbour reference sites to find. Ignored if NULL
#' @param adaptive Logical, whether to use adaptive breaks to select nearest neighbour reference sites.
#' @param ad.factor Power factor used in the inverse-power function if adaptive==T
#' @param ad.constant Constant factor used in the inverse-power function if adaptive==T
#' @param RDA.reference Data frame containing indicator metric data at reference sites. Row names must match with Reference. If it is supplied the function will perform RDA-ANNA, otherwise ANNA.
#' @return $final.dist - Vector containing nearest neighbour reference sites to test site as well as the ecological distance of the test site to each reference site.
#' @return $method - Whether ANNA or RDA-ANNA was used
#' @return $k - The number of reference sites requested or "Adaptive"
#' @return $all.dist - The ecological distance of all reference sites to the test site
#' @return $ref.scores - Ordination axis scores of reference sites
#' @return $test.scores - Ordination axis scores of test site
#' @keywords Test Site Analysis, ANNA
#' @export
#' @examples
#' #load datasets
#' data(YKEnvData,envir = environment()) #Biological dataset
#' data(YKBioData,envir = environment()) #Environmental dataset
#'
#' #Calculate indicator metrics from raw biological data
#' bio.data.test<-benth.met(YKBioData,2,2)
#'
#' #Extract just the summary metrics
#' bio.data<-bio.data.test$Summary.Metrics
#'
#' #standardize row names between datasets
#' rownames(YKEnvData)<-bio.data.test$Site.List
#'
#' #Match a test site (#201) to the nearest neighbour reference set
#' nn.sites<-site.match(YKEnvData[201,-c(1)],YKEnvData[1:118,-c(1)],k=NULL,adaptive=T)
#'
#' #Evaluate nearest neighbour selections
#' plot(nn.sites)
#' sitematch.plot(nn.sites)
site.match<-function(Test, Reference, k=NULL, adaptive=T, ad.factor=2, ad.constant=1, RDA.reference=NULL) {
if (any(colnames(Test)%in%colnames(Reference)==F)|
(ncol(Test)!=ncol(Reference))) {
stop("Variable name mismatch between test site and reference set")
}
if (adaptive==F & is.null(k)){
k=nrow(Reference)
}
Reference<-as.matrix(Reference)
Reference<-Reference[,apply(Reference,2, min)-apply(Reference,2, max)!=0]
Reference<-Reference[,!is.na(colnames(Reference))]
Test<-Test[,colnames(Reference)]
if (is.null(RDA.reference)){
anna.ref<-prcomp(Reference,center = TRUE, scale = T)
sig<-length(which(eigenvals(anna.ref)>bstick(anna.ref)))
anna.test.x<-predict(anna.ref,Test)
anna.test<-(anna.test.x - apply(anna.ref$x,2,min)) / (apply(anna.ref$x,2,max) - apply(anna.ref$x,2,min))
anna.ref.x<-decostand(anna.ref$x,method="range")
sig<-min(sig, ncol(anna.ref.x))
temp<-rbind(sweep(sweep(anna.ref.x,2,anna.test,"-"),2, eigenvals(anna.ref)/sum(eigenvals(anna.ref)),"*")[,1:sig],rep(0,sig))
anna.dist<- sort(as.matrix(dist(temp))[,nrow(Reference)+1])
anna.dist<-anna.dist[-c(which(names(anna.dist)==""))]
}
if (!is.null(RDA.reference)){
if (any((rownames(RDA.reference)==rownames(Reference))==F)) {
stop("site name mismatch between RDA.reference and Reference")
}
if (ncol(RDA.reference)>(ncol(Reference)-1)){
stop("Too many metrics for number of environmental variables")
#RDA.reference<-RDA.reference[,colSums(RDA.reference[1:nrow(Reference),])!=0]
#rda.pca<-prcomp(RDA.reference,center = TRUE, scale = T)
#sig<-length(which(cumsum(eigenvals(rda.pca)/sum(eigenvals(rda.pca)))<0.95))
#RDA.reference<-rda.pca$x[,1:sig]
}
if (ncol(RDA.reference)>(nrow(Reference-1))){
stop("Too many metrics selected for number of Reference Sites")
}
if (ncol(RDA.reference)>((1/2)*nrow(Reference-1))){
warning("Number of indicator metrics greater than 1/2 number of reference sites")
}
RDA.reference<-scale(RDA.reference,T,T)[,]
RDA.reference[is.nan(RDA.reference)]<-0
RDA.reference<-RDA.reference[,colSums(RDA.reference[1:nrow(Reference),])!=0]
anna.ref<-rda(scale(Reference,T,T)[,],RDA.reference,scale=F)
sig.pca<-prcomp(Reference,center = T, scale = T)
sig<-length(which(eigenvals(sig.pca)>bstick(sig.pca)))
#sig<-length(which(anova(rda(x~RDA.reference,scale=F,data=mydf),by="axis")[,4]<0.05))
#sig<-length(which(anova(anna.ref,by="axis")[,4]<0.05))
anna.test.x<-predict(object=anna.ref,newdata=(Test-colMeans(Reference))/apply(Reference,2,sd),model="CCA",type="wa")
anna.test<-(anna.test.x - apply(anna.ref$CCA$wa,2,min)) / (apply(anna.ref$CCA$wa,2,max) - apply(anna.ref$CCA$wa,2,min))
anna.ref.x<-decostand(anna.ref$CCA$wa,method="range")
sig<-min(sig, ncol(anna.ref.x))
temp<-rbind(sweep(sweep(anna.ref.x,2,anna.test,"-"),2, anna.ref$CCA$eig/sum(anna.ref$CCA$eig),"*")[,1:sig],rep(0,sig))
anna.dist<-sort(as.matrix(dist(temp))[,nrow(Reference)+1])
anna.dist<-anna.dist[-c(which(names(anna.dist)==""))]
}
final.dist<-NULL
if (adaptive==T) {
l<-NULL
for (i in 3:if (is.null(k)) length(anna.dist) else k){
l[i]<-diff(anna.dist[1:i],lag=1)[i-2]
if (l[i]>(1/(i^ad.factor))){
break
}
}
final.dist<-anna.dist[1:length(l)]
#if (k==F){
# final.dist<-anna.dist[1:Ckmeans.1d.dp(anna.dist[1:nrow(Reference)])$size[1]]
#}
#else {
# final.dist<-anna.dist[1:Ckmeans.1d.dp(anna.dist[1:nrow(Reference)])$size[1]]
#}
#if (length(final.dist)>35){
# final.dist<-anna.dist[1:Ckmeans.1d.dp(final.dist,2)$size[1]]
#}
} else {
l<-NULL
for (i in 3:if (is.null(k)) length(anna.dist) else k){
l[i]<-diff(anna.dist[1:i],lag=1)[i-2]
if (l[i]>(ad.constant*(1/(i^ad.factor)))){
break
}
}
if (mean(anna.dist[1:k])>mean(anna.dist[1:length(l)])) {
warning("Some reference sites may be poorly matched ecologically to test site. Examine plot for confirmation. Consider using adaptive nearest-neighbour breaks.")
}
final.dist<-anna.dist[1:k]
}
if (final.dist[1]>mean(final.dist)){
warning("Test site may not match reference set ecologically. Examine plot for confirmation. Consider additional reference sites, or additional ecological data.")
}
output<-NULL
output$final.dist<-final.dist
output$method<- if (is.null(RDA.reference)) {"ANNA"} else {"RDA-ANNA"}
output$adaptive<- if (adaptive) {TRUE} else {FALSE}
output$k<- k
output$sig.axis<-sig
output$all.dist<-anna.dist
output$ref.scores<-anna.ref
output$test.scores<-anna.test.x
output$env.data<-rbind(Reference,Test)
class(output)<-"match.object"
return(output)
}
#' @export
print.match.object<-function(match.object){
cat(paste0("Method: ",match.object$method, " with ", match.object$sig.axis, " significant axes"),"\n\n")
if (!is.null(match.object$k) & match.object$adaptive==T) {
cat(paste0("Adaptive threshold with ",match.object$k," upper maximum nearest neigbours."))
}
if (is.null(match.object$k) & match.object$adaptive==T) {
cat(paste0("Adaptive threshold with no upper maximum\n\n"))
}
if (!is.null(match.object$k) & match.object$adaptive==F) {
cat(paste0("Fixed threshold of ",match.object$k, " nearest neighbours\n\n"))
}
cat("Selected nearest neighbour reference sites:\n")
print(match.object$final.dist)
}
#' @export
summary.match.object<-function(match.object){
print(match.object)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.